Manual curation trials

Published

July 14, 2025

Comparing results of different TEammo curators

Run BLASTs between curated libraries

mkdir -p results

# Make the blastdbs - they are output in same folder as the fasta input
conda activate MCHelper
while IFS=',' read -r species strain marta dean; do
  makeblastdb -in ${marta} -dbtype "nucl"
done < <(sed '1d' curated_libs.csv)

# Run BLASTS - reciprocal not needed
while IFS=',' read -r species strain marta dean; do
  blastn -query ${dean} -db ${marta} -outfmt 6 -max_hsps 1 -out results/${species}_${strain}_dean_vs_marta.blast.out
done < <(sed '1d' curated_libs.csv)

Visualisation

if(!require("BiocManager", character.only = T)) install.packages("BiocManager")
pkgs.list <- readLines("r-requirements.txt")

for (i in 1:length(pkgs.list)){
  if(!require(pkgs.list[i], character.only = T)){
    BiocManager::install(pkgs.list[i], Ncpus = ceiling(parallel::detectCores()/1.7))
    require(pkgs.list[i], character.only = TRUE)
  }else{require(pkgs.list[1],character.only = TRUE)}
}

Load and overall comparisons

setwd("/home/csic/gcy/jgp/extra_storage/dean/mctrials/mctrials")
source("scripts/mccompare_functions.R")

# Load the TE classifications
teClassification <- read.table("orozco_classification-2024_mchelper.csv", sep = ";", header = TRUE)
teClassification <- teClassification %>%
  mutate(AppName = paste(Class, Order, Superfamily, sep ="/") %>% sub("/$", "", .) %>% sub("/$", "", .))
teClassification <- rbind(
  teClassification
  , c("", "Unclassified", "", "Unclassified")
)

# Load libraries to compare

# Drosophila birchii
lib_Dbir_marta <- teReadLib(
  "../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.birchii",
  strain = "v1.0_00_GenBank")

lib_Dbir_dean <- teReadLib(
  "data/MCH_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.birchii",
  strain = "v1.0_00_GenBank")

# Drosophila merina
lib_Dmer_marta <- teReadLib(
  "../../data/final_libraries/D.merina/D.merina.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.merina", strain = "NA")

lib_Dmer_dean <- teReadLib(
  "data/MCH_output/D.merina/NA/D.merina_NA_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.merina", strain = "NA")

# Drosophila santomea
lib_Dsan_marta <- teReadLib(
  "../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq")

lib_Dsan_dean <- teReadLib(
  "data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq")


# Drosophila tristis
lib_Dtri_marta <- teReadLib(
  "../../data/final_libraries/D.tristis/D.tristis.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.tristis", strain = "nanopore_D2")

lib_Dtri_dean <- teReadLib(
  "data/MCH_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.tristis", strain = "nanopore_D2")
# Load the blast results comparing curated libraries

blast_Dbir_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.birchii_v1.0_00_GenBank_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dbir_dean,
  blast_subject_lib = lib_Dbir_marta,
  species = "D.birchii",
  strain = "v1.0_00_GenBank",
  comparison = "dean_vs_marta")
   
blast_Dmer_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.merina_NA_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dmer_dean,
  blast_subject_lib = lib_Dmer_marta,
  species = "D.merina",
  strain = "NA",
  comparison = "dean_vs_marta")

blast_Dsan_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.santomea_STO_CAGO_1482_RefSeq_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dsan_dean,
  blast_subject_lib = lib_Dsan_marta,
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq",
  comparison = "dean_vs_marta")

blast_Dtri_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.tristis_nanopore_D2_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dtri_dean,
  blast_subject_lib = lib_Dtri_marta,
  species = "D.tristis",
  strain = "nanopore_D2",
  comparison = "dean_vs_marta")

# Bring them all together for overall
blast_all <- bind_rows(
  blast_Dbir_dean_vs_marta,
  blast_Dmer_dean_vs_marta,
  blast_Dsan_dean_vs_marta,
  blast_Dtri_dean_vs_marta
)

How do the libraries compare overall?

lib1 <- width(lib_Dtri_dean) %>%
  data.frame(length = .)
lib1$curator <- "dean"

lib2 <- width(lib_Dtri_marta) %>%
  data.frame(length = .)
lib2$curator <- "marta"

bind_rows(lib1, lib2) %>%
    ggplot(aes(x = length)) +
      geom_density(aes(fill = curator), alpha = 0.5, color = "black") +
      labs(title = "Density Plot of DNA Sequence Lengths",
           x = "Sequence Length (bp)",
           y = "Density") +
      theme_minimal()

tePlotLib(list(lib_Dbir_dean, lib_Dbir_marta))
tePlotLib(list(lib_Dmer_dean, lib_Dmer_marta))
tePlotLib(list(lib_Dsan_dean, lib_Dsan_marta))
tePlotLib(list(lib_Dtri_dean, lib_Dtri_marta))

How does agreement differ across TE classes?

TE libraries compared by sequence similarity and classification agreement. seq_match (x axis) represents the BLASTn pident score, categorised as Perfect, Present 80, Present 70, and Missing (< 70, missing from either query or subject)

PlotBlastTileMatches(blast_Dbir_dean_vs_marta)
Figure 1
PlotBlastTileMatches(blast_Dmer_dean_vs_marta)
Figure 2
PlotBlastTileMatches(blast_Dsan_dean_vs_marta)
Figure 3
PlotBlastTileMatches(blast_Dtri_dean_vs_marta)
Figure 4
PlotBlastViolMatches(blast_Dbir_dean_vs_marta)
Figure 5
PlotBlastViolMatches(blast_Dmer_dean_vs_marta)
Figure 6
PlotBlastViolMatches(blast_Dsan_dean_vs_marta)
Figure 7
PlotBlastViolMatches(blast_Dtri_dean_vs_marta)
Figure 8